import os
import subprocess


# Substrings that mark a listed name as a non-article page (see isArticle).
SKIP_TYPES = [
    "Image",
    "User_Talk",
    "Category",
    "User",
    "Talk",
    "Template",
]

# Directory whose files each list page paths, one path per line.
WIKI_BASE_DIR = (
    "/lfs1/users/sidshah/code/Wiki_Final/wikisimpy/"
    "snap_3_13_2008/bow_features/wiki_list_files"
)

# Root that each listed path is joined onto to locate the page given to lynx.
READ_BASE_DIR = "/nfs/isd3/pantel/limbo/WIKIPEDIA/April_07_Static_Dump/"

# Output directory for the text dumps (relative to the working directory).
STORE_BASE_DIR = "wiki_textdump"


def normalizeFileName(fileName):
    r"""
    Escape every single quote in a file name with a leading backslash.

    The replacement must be written as "\\'" (or a raw string): in a normal
    string literal "\'" is only an escaped quote and equals "'", which would
    turn the substitution into a no-op.

    fileName -- the name to escape.
    Returns a new string; names without single quotes come back unchanged.
    """
    return fileName.replace("'", "\\'")

def isArticle(fileName):
    """
    Decide from its name alone whether a file is a wikipedia article.

    The name is rejected (False) when any entry of SKIP_TYPES occurs
    anywhere inside it as a plain, case-sensitive substring; otherwise
    it is accepted (True).
    """
    return not any(marker in fileName for marker in SKIP_TYPES)


# Convert every article named in the list files under WIKI_BASE_DIR to plain
# text with lynx, writing one output file per article into STORE_BASE_DIR.

# Output files are opened directly below, so the target directory must exist.
if not os.path.isdir(STORE_BASE_DIR):
    os.makedirs(STORE_BASE_DIR)

for eachFile in os.listdir(WIKI_BASE_DIR):
    listPath = os.path.abspath(os.path.join(WIKI_BASE_DIR, eachFile))
    # The context manager closes each list file as soon as it has been read.
    with open(listPath) as listFile:
        for eachLine in listFile:
            eachLine = eachLine.strip()
            # A blank line would resolve to the output directory itself.
            if not eachLine or not isArticle(eachLine):
                continue

            # Flatten the listed relative path into a single file name.
            fileToSave = os.path.join(STORE_BASE_DIR, eachLine.replace("/", "_"))
            # Append mode keeps the ">>" semantics of the shell redirect.
            # Passing an argument list (no shell) means quotes, "$" or
            # backticks in a listed name cannot break or inject into the
            # command line.
            with open(fileToSave, "a") as outFile:
                subprocess.call(
                    ["lynx", "-dump", os.path.join(READ_BASE_DIR, eachLine)],
                    stdout=outFile,
                )
